Plotting for metric learning output

library(tidyverse)
library(dimRed)
library(reticulate)
library(here)
library(viridis)
library(hdrcde)
library(igraph)
library(matrixcalc)
library(akima)
library(car)
library(ggforce)
library(ks)
Jmisc::sourceAll(here::here("R/sources"))
set.seed(123)

Metric learning

Smart meter data for one household

# load(here::here("data/spdemand_3639id336tow.rda"))
# nid <- 1
# ntow <- 336
# 
# train <- spdemand %>%
#   lazy_dt() %>%
#   filter(tow <= ntow,
#          # id <= sort(unique(spdemand[,id]))[nid],
#          id == 1003) %>%
#   dplyr::select(-id, -tow) %>%
#   data.table::as.data.table()
train <- readRDS(here::here("data/spdemand_1id336tow_train.rds"))

Radius searching with k-d trees

# Parameters fixed
x <- train
N <- nrow(train)
s <- 2
k <- 20
method <- "annIsomap"
annmethod <- "kdtree"
distance <- "euclidean"
treetype <- "kd"
searchtype <- "radius" # change searchtype for radius search based on `radius`, or KNN search based on `k`
radius <- .4 # the bandwidth parameter, \sqrt(\elsilon), as in algorithm

The metric learning algorithm

metric_isomap <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, invert.h = TRUE)
summary(metric_isomap)
##                Length Class     Mode   
## embedding         672 -none-    numeric
## rmetric          1344 -none-    numeric
## weighted_graph     10 igraph    list   
## adj_matrix     112896 dgCMatrix S4     
## laplacian      112896 dgCMatrix S4

The function metricML() returns a list of

  • the embedding coordinates embedding of \(N \times s\)
  • the embedding metric rmetric for each point \(p \in x\) as an array. By default, invert.h = TRUE returns the inverse of Riemannian metric.
  • the weighted neighborhood graph as an igraph object weighted_graph
  • the sparse adjacency matrix from the graph adj_matrix
  • the graph laplacian matrix laplacian

Embedding plot

# fn <- metric_isomap$embedding
# head(fn)
# plot(fn, col = viridis::viridis(24), asp = 1)
plot_embedding(metric_isomap) +
  labs(x = "ISO1", y = "ISO2")

Ellipse plot

Using the Riemannian metric for each point as the 2-d covariance matrix and the 2-d embeddings as the centroid to plot an ellipse for each point.

The underlying requirement is that the Riemannian metric needs to be a square positive definite matrix.

plot_ellipse(metric_isomap, add = F, n.plot = 50, scale = 20, 
             color = blues9[5], fill = blues9[5], alpha = 0.2)

# fn <- metric_isomap$embedding
# Rn <- metric_isomap$rmetric # array
# n.plot <- 50
# e <- riem2ellipse(Rn, scale = 20) %>% 
#   cbind(fn) %>% 
#   as_tibble()
# OR
# plot_embedding(metric_isomap, embedding = F) +
#   plot_ellipse(metric_isomap, add = T)
# OR
# # plot_embedding(fn, embedding = T) + 
#   ggplot(data = e, aes(E1, E2)) +
#   geom_point() +
#   geom_ellipse(data = slice_sample(e, n = n.plot), 
#                aes(x0 = E1, y0 = E2, a = a, b = b, angle = angle, group = -1), 
#                color = blues9[5], fill = blues9[5], alpha = 0.2)

Variable kernel density estimate

For multivariate data, the variable kernel density estimate is given by \[\hat{f}(x) = n^{-1} \sum_i K_{H_i} (x - X_i).\]

Outlier plot based on density estimates

Fixed bandwidth

# fixed bandwidth
fn <- metric_isomap$embedding
E1 <- fn[,1] # rename as Ed to match the aesthetics in plot_ellipse()
E2 <- fn[,2]
prob <- c(1, 50, 99)
p_outlier <- hdrscatterplot(E1, E2, levels = prob, noutliers = 20, label = NULL) + 
  plot_ellipse(metric_isomap, add = T, n.plot = 50, scale = 20, 
             color = blues9[5], fill = blues9[5], alpha = 0.2)
p_outlier

Pointwise adaptive bandwidth

Rn <- metric_isomap$rmetric # array
n.grid <- 10
f <- vkde2d(x = fn[,1], y = fn[,2], h = Rn, n = n.grid)
plot_contour(metric_isomap, n.grid = n.grid, scale = 1)

source(here::here("R/sources/hdrplotting.R"))
p_outlier_vkde <- plot_outlier(x = metric_isomap, n.grid = 50, prob = prob)
# den <- f
# x <- E1; y <- E2
#  <-  = c(50, 95, 99)
# # prob = (1:10)*10
# # Convert prob to coverage percentage if necessary
# if(max(prob) > 50) {# Assume prob is coverage percentage
#   alpha <- (100-prob)/100
# } else {# prob is tail probability (for backward compatibility)
#   alpha <- prob}
# alpha <- sort(alpha)
# # Calculates falpha needed to compute HDR of bivariate density den.
# # Also finds approximate mode.
# fxy <- hdrcde:::interp.2d(den$x,den$y,den$z,x,y)
# falpha <- quantile(fxy, alpha)
# index <- which.max(fxy)
# mode <- c(x[index],y[index])
# hdr2d_info <- structure(list(mode=mode,falpha=falpha,fxy=fxy, den=den, alpha=alpha, x=x, y=y), class="hdr2d") # list for hdr.2d() output
# # plot.hdr2d(hdr2d_info, show.points = T, outside.points = T, pointcol = grey(0.5), xlim = round(range(x)), ylim = round(range(y)))
# 
# p_outlier_vkde <- hdrscatterplot(E1, E2, levels = prob, noutliers = 20, label = NULL, den = hdr2d_info)  + 
#   plot_ellipse(metric_isomap, add = T, n.plot = 50, scale = 20, 
#              color = blues9[5], fill = blues9[5], alpha = 0.2)
library(patchwork)
(p_outlier + p_outlier_vkde ) + coord_fixed()

t-SNE

x <- train
method <- "tSNE"
perplexity <- 30
theta <- 0.5
metric_tsne <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, perplexity = perplexity, theta = theta, invert.h = TRUE)
summary(metric_tsne)
##                Length Class     Mode   
## embedding         672 -none-    numeric
## rmetric          1344 -none-    numeric
## weighted_graph     10 igraph    list   
## adj_matrix     112896 dgCMatrix S4     
## laplacian      112896 dgCMatrix S4
ftsne <- metric_tsne$embedding
E1 <- ftsne[,1]; E2 <- ftsne[,2]
# plot_embedding(metric_tsne)
plot_ellipse(metric_tsne, n.plot = 50)

plot_contour(metric_tsne, n.grid = 20, scale = 1/8)

p_tsne <- plot_outlier(x = metric_tsne, n.grid = 20, prob = prob, noutliers = 20, scale = 1/8)
p_tsne

hdrscatterplot(E1, E2, kde.package = "ks", noutliers = 20) + 
  plot_ellipse(metric_tsne, n.plot = 50, add = T)

p_outlier_vkde + p_tsne